knitr::opts_chunk$set(message = FALSE)
library(bslib)
library(dplyr)
library(DT)
library(ggplot2)
library(glue)
library(here)
library(lubridate)
library(plotly)
library(purrr)
library(readr)
library(rlang)
library(stringr)
library(tidyr)
theme_set(theme_bw())
# Root directory that is scanned for report files; supplied through the
# document parameters.
input_dir <- params$input_dir # here("data")
# File-type names to exclude from the per-user index. They are combined into
# one regular expression and matched anywhere in the full file path.
aggregated_filetypes <- c("blamematrix", "catalog", "mimeo")
# TODO: only load last N weeks of data to keep RAM usage reasonably low
# Index of per-user report files: one row per file found in the
# sub-directories of input_dir (input_dir itself is skipped). The full path is
# split on "." into date/path/username/file/ext, and `filename` is kept.
user_dat <- tibble(
  filename = list.dirs(input_dir) %>%
    Filter(function(x) {
      x != input_dir
    }, .) %>%
    lapply(function(x) {
      list.files(x, full.names = TRUE)
    }) %>%
    unlist()
) %>%
  filter(!str_detect(filename, paste(aggregated_filetypes, collapse = "|"))) %>%
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext"),
    # Paths with fewer than five pieces get NA in the trailing pieces and are
    # removed by the `ext` filter below. "debug" selected the same rows but
    # also emitted a warning and added `filename_ok`, `filename_pieces` and
    # `filename_remainder` columns to the result.
    too_few = "align_start"
  ) %>%
  filter(
    str_detect(ext, "tsv|txt"), # only keep tab-delimited files
    !str_detect(username, "[0-9]"), # filter out numeric usernames
    username != "allusers" # filter out the 'allusers' rows
  ) %>%
  # The first piece still carries the directory part; keep only its last
  # path component and parse that as the report date.
  mutate(date = as_date(basename(date)))
# Distinct report dates present in the file index.
dates <- user_dat %>%
  pull(date) %>%
  unique()
# Latest report date. NA dates (a date piece that as_date() could not parse)
# are ignored: without na.rm a single NA makes max() return NA, and
# `date == most_recent_date` would then match no rows at all.
most_recent_date <- max(dates, na.rm = TRUE)
# Distinct user names present in the file index.
usernames <- user_dat %>%
  pull(username) %>%
  unique()
# Save the file index, stamped with today's date, under results/.
user_dat %>% write_tsv(here("results", glue("user-dat_{today()}.tsv")))
# Disk usage in /data/CCBR on Biowulf
# Read every "summary" report dated most_recent_date, stack them into one
# table, and keep only the rows whose FolderPath is /data/CCBR.
summary_dat_recent <- user_dat %>%
  # username %in% users_filter,
  filter(file == "summary") %>%
  filter(date == most_recent_date) %>%
  pull(filename) %>%
  map(function(summary_path) {
    # Tag each row with the path it was read from; the path is split into
    # its pieces (including the username) after the tables are stacked.
    mutate(read_tsv(summary_path), filename = summary_path)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  filter(FolderPath == "/data/CCBR")
# Save the snapshot, stamped with today's date, under results/.
summary_dat_recent %>%
  write_tsv(here("results", glue("summary-dat-recent_{today()}.tsv")))
# Names of the numeric columns of the most recent summary table, in order of
# first appearance after pivoting; each one is treated as a metric.
summary_metrics <- summary_dat_recent %>%
  pivot_longer(cols = where(is.numeric), names_to = "metric") %>%
  distinct(metric) %>%
  pull(metric)
# Users that rank among the ten largest adjusted values of at least one
# metric in the most recent summary table.
top_users <- summary_dat_recent %>%
  pivot_longer(cols = all_of(summary_metrics), names_to = "metric") %>%
  mutate(
    # Metrics whose name contains "score"/"Score" are ranked on the negated
    # value; every other metric is ranked on the value itself.
    value_adj = if_else(str_detect(metric, "[sS]core"), -value, value)
  ) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  ungroup() %>%
  distinct(username) %>%
  pull(username)
# One horizontal bar chart per summary metric for the most recent date,
# restricted to the users in `top_users`; each chart is converted to plotly
# and wrapped in its own nav panel.
plots <- summary_metrics %>% lapply(function(y_metric) {
  plot_dat <- summary_dat_recent %>%
    filter(username %in% top_users)
  # Metrics whose name contains "score"/"Score" are ordered on the negated
  # value, the same adjustment that is applied when `top_users` is selected.
  rank_sign <- if (str_detect(y_metric, "[sS]core")) -1 else 1
  # Users in ascending order of the adjusted value; used as the factor levels
  # of the y axis. unique() keeps factor() from failing on duplicated levels
  # should a user occur in more than one row.
  user_order <- plot_dat %>%
    arrange(rank_sign * .data[[y_metric]]) %>%
    pull(username) %>%
    unique()
  # NOTE(review): the tooltip text contains the metric name, not its value --
  # confirm that this is intended.
  p <- plot_dat %>%
    mutate(username = factor(username, levels = user_order)) %>%
    ggplot(aes(
      x = .data[[y_metric]],
      y = username,
      fill = .data[[y_metric]],
      text = glue("{username}\n{y_metric}\n{FolderPath}")
    )) +
    geom_col() +
    labs(x = y_metric, y = "") +
    theme(legend.position = "none")
  nav_panel(
    title = y_metric,
    card_header(y_metric),
    ggplotly(p, tooltip = "text")
  )
})
# Show the panels as a pill list, one entry per metric.
do.call(navset_pill_list, plots)
# Read the "summary" report of every user for every date in the file index,
# stack them into one table, and keep only the rows whose FolderPath is
# /data/CCBR.
summary_dat_all <- user_dat %>%
  # username %in% users_filter,
  filter(file == "summary") %>%
  pull(filename) %>%
  map(function(summary_path) {
    # Tag each row with the path it was read from; date and username are
    # recovered from that path after the tables are stacked.
    mutate(read_tsv(summary_path), filename = summary_path)
  }) %>%
  list_rbind() %>%
  separate_wider_delim(
    filename,
    delim = ".",
    names = c("basepath", "path", "username", "file", "ext"),
    cols_remove = FALSE
  ) %>%
  # Drop everything up to the last "/" of the first piece and parse the
  # remainder as the report date.
  mutate(date = as_date(str_replace(basepath, ".*/", ""))) %>%
  filter(FolderPath == "/data/CCBR") # TODO: repeat for /data/CCBR_Pipeliner
# Save the full history, stamped with today's date, under results/.
summary_dat_all %>%
  write_tsv(here("results", glue("summary-dat-all_{today()}.tsv")))
# Re-derive `top_users` from the table covering all dates: users that own at
# least one of the ten largest adjusted values of any metric.
# NOTE(review): the ranking is over rows, so if a user has rows for several
# dates that user can take several of the ten places for a metric -- confirm
# that ranking rows rather than users is intended.
top_users <- summary_dat_all %>%
  pivot_longer(cols = all_of(summary_metrics), names_to = "metric") %>%
  mutate(
    # Metrics whose name contains "score"/"Score" are ranked on the negated
    # value; every other metric is ranked on the value itself.
    value_adj = if_else(str_detect(metric, "[sS]core"), -value, value)
  ) %>%
  group_by(metric) %>%
  slice_max(order_by = value_adj, n = 10) %>%
  ungroup() %>%
  distinct(username) %>%
  pull(username)
# One time-series chart per summary metric across all dates, restricted to
# the users in `top_users`; each chart is converted to plotly and wrapped in
# its own nav panel.
plots <- summary_metrics %>% lapply(function(y_metric) {
  # Filtering on `top_users` directly selects the same rows as filtering on a
  # per-metric list of those users' names, so no per-metric pivot is needed.
  p <- summary_dat_all %>%
    filter(username %in% top_users) %>%
    ggplot(aes(date, .data[[y_metric]], color = username)) +
    geom_line(alpha = 0.7) +
    # `text` is mapped on the point layer only and is read by ggplotly() for
    # the tooltip; ggplot2 itself reports it as an unknown aesthetic.
    # NOTE(review): presumably it is kept out of the global mapping so that it
    # cannot influence how geom_line() groups observations -- confirm.
    geom_point(
      aes(text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}"))
    ) +
    labs(y = y_metric)
  nav_panel(
    title = y_metric,
    card_header(y_metric),
    ggplotly(p, tooltip = "text")
  )
})
## Warning in geom_point(aes(text = glue("{username}\n{y_metric}\n{FolderPath}\n{date}"))): Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
## Ignoring unknown aesthetics: text
# Show the panels as a pill list, one entry per metric; the list of panels is
# spliced into the call as individual unnamed arguments.
rlang::exec(navset_pill_list, !!!plots)
# Read every /data/CCBR "grubbers" report from the directory of the most
# recent date and stack them into one table.
grub_dat <- list.files(
  # input_dir is used as given, exactly as when the file index is built, so
  # both steps look in the same place even if input_dir is a relative path.
  file.path(input_dir, format(most_recent_date, format = "%Y%m%d")),
  full.names = TRUE
) %>%
  str_subset("_data_CCBR\\..*\\.grubbers\\.tsv") %>%
  map(function(x) {
    # The files are read without a header row, so columns arrive as X1..X5.
    read_tsv(x, col_names = FALSE) %>%
      mutate(filename = x)
  }) %>%
  list_rbind() %>%
  rename(
    file_hash = X1,
    file_count = X2,
    total_disk_usage = X3,
    single_disk_usage = X4,
    filepaths = X5
  ) %>%
  # Split the dot-delimited path into its pieces; `filename` itself is kept.
  separate_wider_delim(filename,
    delim = ".", cols_remove = FALSE,
    names = c("date", "path", "username", "file", "ext")
  ) %>%
  mutate(date = as_date(basename(date))) %>%
  filter(
    !str_detect(username, "[0-9]"), # filter out numeric usernames
    username != "allusers" # filter out the 'allusers' rows
  ) %>%
  # Both usage columns are split on a single space into a value and a unit;
  # the original columns are kept.
  # NOTE(review): only the value part is converted to numeric below and it is
  # not rescaled by the unit part -- confirm all rows share one unit.
  separate_wider_delim(total_disk_usage,
    delim = " ",
    names = c("total_disk_usage_value", "total_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  separate_wider_delim(single_disk_usage,
    delim = " ",
    names = c("single_disk_usage_value", "single_disk_usage_unit"),
    cols_remove = FALSE
  ) %>%
  mutate(across(
    all_of(c("total_disk_usage_value", "single_disk_usage_value")),
    as.numeric
  ))
# Save the table, stamped with today's date, under results/.
grub_dat %>% write_tsv(here("results", glue("grub-dat_{today()}.tsv")))
# Grubbers rows sorted by total disk usage, largest first, reduced to the
# three columns shown in the report table.
# NOTE(review): the usage value is labelled `disk_usage_gb`, but the unit
# column of grub_dat is not consulted here -- confirm every value is in GB.
top_files <- grub_dat %>%
  arrange(desc(total_disk_usage_value)) %>%
  select(disk_usage_gb = total_disk_usage_value, username, filepaths)
# Render the table as an interactive DT widget inside a card.
card(card_header("Top files"), datatable(top_files, fillContainer = TRUE))